customize 你的 GSEA 图

Original JunJunLab 老俊俊的生信笔记 2023-06-15

收录于合集 #R绘图 116个

我是如此相信

1引言

参考前面 Y 叔 enrichplot 包中的 gseaplot2 代码,我们提取数据自己来一步步画图,比如做一些图形的延伸或者个性化绘图。在这里感谢 Y 叔的贡献及代码。

今天来拿自己的数据来一步步绘制常见的图形和一些不一样的 GSEA 图形。

2准备基因集

msigdbr 这个包方便我们直接下载对应的基因集:

# install.packages("msigdbr")
library(tidyverse)
library(msigdbr)
library(clusterProfiler)
library(org.Mm.eg.db)
library(enrichplot)
library(RColorBrewer)
library(ggrepel)
library(ggplot2)
library(aplot)

# 查看msigdbr包支持的物种
msigdbr_species()

# # A tibble: 20 x 2
#   species_name                    species_common_name
#   <chr>                           <chr>
# 1 Anolis carolinensis             Carolina anole, green anole
# 2 Bos taurus                      bovine, cattle, cow, dairy cow, domestic cattle, domestic cow, o~
# 3 Caenorhabditis elegans          NA
# 4 Canis lupus familiaris          dog, dogs
# 5 Danio rerio                     leopard danio, zebra danio, zebra fish, zebrafish
# 6 Drosophila melanogaster         fruit fly
# 7 Equus caballus                  domestic horse, equine, horse
# 8 Felis catus                     cat, cats, domestic cat
# 9 Gallus gallus                   bantam, chicken, chickens, Gallus domesticus
# 10 Homo sapiens                    human
# 11 Macaca mulatta                  rhesus macaque, rhesus macaques, Rhesus monkey, rhesus monkeys
# 12 Monodelphis domestica           gray short-tailed opossum
# 13 Mus musculus                    house mouse, mouse
# 14 Ornithorhynchus anatinus        duck-billed platypus, duckbill platypus, platypus
# 15 Pan troglodytes                 chimpanzee
# 16 Rattus norvegicus               brown rat, Norway rat, rat, rats
# 17 Saccharomyces cerevisiae        baker's yeast, brewer's yeast, S. cerevisiae
# 18 Schizosaccharomyces pombe 972h- NA
# 19 Sus scrofa                      pig, pigs, swine, wild boar
# 20 Xenopus tropicalis              tropical clawed frog, western clawed frog

所有基因集:

下载:

我这里下载了小鼠的生物过程基因集:

# choose C5 BP gene sets
genesets <-  msigdbr(species = "Mus musculus",
                     category = "C5",
                     subcategory = "BP")

# check
head(genesets,3)
# # A tibble: 3 x 18
# gs_cat gs_subcat gs_name    gene_symbol entrez_gene ensembl_gene human_gene_symb~ human_entrez_ge~
#   <chr>  <chr>     <chr>      <chr>             <int> <chr>        <chr>                       <int>
# 1 C5     GO:BP     GOBP_10_F~ Aasdhppt          67618 ENSMUSG0000~ AASDHPPT                    60496
# 2 C5     GO:BP     GOBP_10_F~ Aldh1l1          107747 ENSMUSG0000~ ALDH1L1                     10840
# 3 C5     GO:BP     GOBP_10_F~ Aldh1l2          216188 ENSMUSG0000~ ALDH1L2                    160428
# # ... with 10 more variables: human_ensembl_gene <chr>, gs_id <chr>, gs_pmid <chr>, gs_geoid <chr>,
# #   gs_exact_source <chr>, gs_url <chr>, gs_description <chr>, taxon_id <int>,
# #   ortholog_sources <chr>, num_ortholog_sources <dbl>

# TERM2GENE
gmt <- genesets %>% dplyr::select(gs_name,gene_symbol)

3读取基因

导入我们的基因及对应的差异倍数:

# TERM2GENE
gmt <- genesets %>% dplyr::select(gs_name,gene_symbol)

# load gene
gene <- read.table('gsea-test-data.txt',header = T) %>%
  arrange(desc(log2FoldChange))

# check
head(gene,3)
#       gene_name log2FoldChange
# 1         Ecscr       6.015527
# 2       Gm32341       5.962540
# 3 B130034C11Rik       5.841789

geneList <- gene$log2FoldChange
names(geneList) <- gene$gene_name

# check
head(geneList,3)
#    Ecscr       Gm32341 B130034C11Rik
# 6.015527      5.962540      5.841789

4GSEA 富集分析

因为使用的是自己准备的通路集合,所以这里拿的是 GSEA 函数进行富集分析:

# GO enrich
gseaRes <- GSEA(geneList = geneList,
                TERM2GENE = gmt,
                minGSSize    = 10,
                maxGSSize    = 500,
                pvalueCutoff = 1,
                pAdjustMethod = "BH",
                verbose      = FALSE)

# to dataframe
data_ga <- data.frame(gseaRes) %>%
  filter(pvalue < 0.05)

画个图看看:

# gseaplot2
gseaplot2(gseaRes, geneSetID = 'GOBP_NUCLEOSIDE_DIPHOSPHATE_METABOLIC_PROCESS',
          title = gseaRes$Description['GOBP_NUCLEOSIDE_DIPHOSPHATE_METABOLIC_PROCESS'])

5提取数据

参考之前的,我们先把这个通路对应的基因排序数据提取出来:

gseaScores <- getFromNamespace("gseaScores", "DOSE")

# define function
gsInfo <- function(object, geneSetID) {
  geneList <- object@geneList

  if (is.numeric(geneSetID))
    geneSetID <- object@result[geneSetID, "ID"]

  geneSet <- object@geneSets[[geneSetID]]
  exponent <- object@params[["exponent"]]
  df <- gseaScores(geneList, geneSet, exponent, fortify=TRUE)
  df$ymin <- 0
  df$ymax <- 0
  pos <- df$position == 1
  h <- diff(range(df$runningScore))/20
  df$ymin[pos] <- -h
  df$ymax[pos] <- h
  df$geneList <- geneList

  df$Description <- object@result[geneSetID, "Description"]
  return(df)
}

提取:

# get data
gsdata <- gsInfo(gseaRes, geneSetID = 'GOBP_NUCLEOSIDE_DIPHOSPHATE_METABOLIC_PROCESS')


gsdata1 <- gsdata %>%
  mutate("gene_name" = gene$gene_name) %>%
  filter(position == 1)

# check
head(gsdata1,3)
#     x runningScore position        ymin       ymax geneList                                   Description gene_name
# 1   4   0.05996027        1 -0.02241342 0.02241342 5.803995 GOBP_NUCLEOSIDE_DIPHOSPHATE_METABOLIC_PROCESS     Hkdc1
# 2  29   0.11168581        1 -0.02241342 0.02241342 5.081191 GOBP_NUCLEOSIDE_DIPHOSPHATE_METABOLIC_PROCESS    Entpd3
# 3 122   0.15419642        1 -0.02241342 0.02241342 4.426756 GOBP_NUCLEOSIDE_DIPHOSPHATE_METABOLIC_PROCESS      Dlg2


# colnames
colnames(gsdata1)
# [1] "x"            "runningScore" "position"     "ymin"         "ymax"         "geneList"
# [7] "Description"  "gene_name"

6经典图形绘制

这里我们尝试一步步绘制上面常见的图形出来。

curve plot

# plot
pcurve <- ggplot(gsdata,aes(x = x,y = runningScore,color = runningScore)) +
  geom_hline(yintercept = 0,size = 1,color = 'black',
             lty = 'dashed') +
  geom_line() +
  # geom_segment(data = gsdata1,aes(xend = x,yend = 0)) +
  theme_bw(base_size = 14) +
  scale_color_gradient(low = '#76BA99',high = '#EB4747') +
  scale_x_continuous(expand = c(0,0)) +
  # scale_y_continuous(expand = c(0,0)) +
  theme(legend.position = 'none',
        axis.ticks.x = element_blank(),
        axis.text.x = element_blank(),
        axis.line.x = element_blank(),
        axis.title.x = element_blank(),
        legend.background = element_rect(fill = "transparent"),
        plot.margin = margin(t = .2,r = .2, b = 0,l = .2,unit = "cm")) +
  ylab('Running Enrichment Score')

pcurve

segment plot

pseg <- ggplot(gsdata,aes(x = x,y = runningScore)) +
  geom_segment(data = gsdata1,
               aes(x = x,xend = x,y = 0,yend = 1),
               color = 'black',show.legend = F) +
  scale_x_continuous(expand = c(0,0)) +
  scale_y_continuous(expand = c(0,0)) +
  theme_bw(base_size = 14) +
  theme(axis.ticks = element_blank(),
        axis.text = element_blank(),
        axis.title.y = element_blank(),
        panel.grid = element_blank(),
        axis.line.x = element_blank(),
        plot.margin = margin(t = 0,r = .2,b = .2,l = .2,unit = "cm")) +
  xlab('Rank in Ordered Dataset')

pseg

segment and heatmap

v <- seq(1, sum(gsdata$position), length.out = 9)
inv <- findInterval(rev(cumsum(gsdata$position)), v)
if (min(inv) == 0) inv <- inv + 1

col <- c(rev(brewer.pal(5, "Blues")), brewer.pal(5, "Reds"))

# ymin <- min(p2$data$ymin)
# yy <- max(p2$data$ymax - p2$data$ymin) * .3
ymin <- 0
yy <- 0.3
xmin <- which(!duplicated(inv))
xmax <- xmin + as.numeric(table(inv)[as.character(unique(inv))])
d <- data.frame(ymin = ymin, ymax = yy,
                xmin = xmin,
                xmax = xmax,
                col = col[unique(inv)])

pseg_ht <- pseg + geom_rect(
  aes_(xmin = ~xmin,xmax = ~xmax,
       ymin = ~ymin,ymax = ~ymax,
       fill = ~I(col)),
  data = d,
  alpha = 0.8,
  inherit.aes = FALSE)

pseg_ht

gene rank plot

# add gene rank
pseg_ht1 <- pseg_ht + xlab('') +
  theme(axis.title.x = element_blank(),
        plot.margin = margin(t = -.1,r = .2,b = 0,l = .2,unit = "cm"))

prank <- ggplot(gsdata,aes(x = x,y = geneList)) +
  geom_col(width = 1,fill = 'grey80',color = NA) +
  # geom_col(aes(fill = geneList),
  #          width = 1,color = NA,show.legend = F) +
  # scale_fill_gradient2(low = col[1],mid = 'white',high = col[length(col)],midpoint = 0) +
  geom_hline(yintercept = 0,size = 0.8,color = 'black',
             lty = 'dashed') +
  theme_bw(base_size = 14) +
  theme(panel.grid = element_blank(),
        plot.margin = margin(t = -.1,r = .2,b = .2,l = .2,unit = "cm")) +
  coord_cartesian(expand = 0) +
  ylab('Ranked List') +
  xlab('Rank in Ordered Dataset')

prank

拼图

# combine
pall <- aplot::plot_list(gglist = list(pcurve,pseg_ht1,prank),
                 ncol = 1, heights = c(0.5,0.2,0.3))

pall

最后我们就可以得到一张完整的 GSEA 常见的图形了。

7绘图拓展

有了以上的基础,我们后面可以做一些更多的事情,改改参数,加加代码啥的。

添加基因名

# add gene name
geneLabel <- gsdata1 %>% arrange(desc(runningScore)) %>%
  head(20)

plabel <- pcurve +
  geom_segment(data = geneLabel,aes(xend = x,yend = 0),
               color = 'red') +
  geom_text_repel(data = geneLabel,
                  aes(label = gene_name),
                  force = 20,
                  max.overlaps = 50,
                  # nudge_y = 0.2,
                  size = 4,
                  fontface = 'italic')

aplot::plot_list(gglist = list(plabel,pseg_ht1,prank),
                 ncol = 1, heights = c(0.5,0.2,0.3))

新的 style

panother <- ggplot(gsdata,aes(x = x,y = runningScore,color = runningScore)) +
  geom_hline(yintercept = 0,size = 0.8,color = 'black',
             lty = 'dashed') +
  geom_point() +
  geom_line() +
  geom_segment(data = gsdata1,aes(xend = x,yend = 0)) +
  theme_bw(base_size = 14) +
  # scale_color_gradient(low = '#336699',high = '#993399') +
  scale_color_gradient2(low = '#336699',mid = 'white',high = '#993399',midpoint = 0.2) +
  scale_x_continuous(expand = c(0,0)) +
  theme(legend.position = 'none',
        axis.ticks.x = element_blank(),
        axis.text.x = element_blank(),
        axis.line.x = element_blank(),
        axis.title.x = element_blank(),
        legend.background = element_rect(fill = "transparent"),
        plot.margin = margin(t = .2,r = .2, b = 0,l = .2,unit = "cm")) +
  ylab('Running Enrichment Score')

panother

添加基因名

# add gene name
panother_label <-
  panother +
  geom_text_repel(data = geneLabel,
                  aes(label = gene_name),
                  force = 20,
                  max.overlaps = 50,
                  # nudge_y = 0.15,
                  size = 4,
                  # color = 'black',
                  fontface = 'italic')

panother_label

添加热图

# new color
color <- rev(colorRampPalette(c("#336699","white", "#993399"))(10))

ht <- ggplot(gsdata,aes(x = x,y = runningScore)) +
  geom_rect(aes_(xmin = ~xmin,xmax = ~xmax,
                 ymin = ~ymin,ymax = ~ymax,
                 fill = ~I(color)),
            data = d,
            alpha = 0.8,
            inherit.aes = FALSE) +
  scale_x_continuous(expand = c(0,0)) +
  scale_y_continuous(expand = c(0,0)) +
  theme_bw(base_size = 14) +
  theme(panel.grid = element_blank(),
        axis.ticks = element_blank(),
        axis.text = element_blank(),
        axis.title = element_blank(),
        plot.margin = margin(t = 0,r = .2, b = .2,l = .2,unit = "cm"))

# combine
aplot::plot_list(gglist = list(panother_label,ht),
                 ncol = 1, heights = c(0.9,0.1))

8结尾

大家可以自由发挥了。

欢迎加入生信交流群。加我微信我也拉你进微信群聊 老俊俊生信交流群 (微信交流群需收取20元入群费用(防止骗子和便于管理))。

老俊俊微信：

知识星球：

所以今天你学习了吗？

今天的分享就到这里了，敬请期待下一篇！

最后欢迎大家分享转发，您的点赞是对我的鼓励和肯定！

如果觉得对您帮助很大，赏杯快乐水喝喝吧！

往期回顾

◀GseaVis 优雅的可视化 GSEA 富集结果

◀GSEA 图是如何画出来的? (源码解析)

◀clusterProfiler 的可视化大全

◀clusterProfiler 做 GSEA 富集分析

◀深度神经网络学习对单细胞数据进行清洗去噪

◀ggpie 解决你的所有饼图绘制

◀Bigwig 可视化用 tackPlotR 试试看?

◀gggsea 个性化绘制 GSEA 图

◀scRNAtoolVis 绘制单细胞 Marker 基因均值表达热图

◀给你 UMAP 坐标重复文章一模一样的图?

◀...

一把短刀，怎么就让他连捅18人？！

听纪委朋友说，有的领导干部在被抽掉鞋带和皮带后，一下就崩溃了，甚至个别胆小者顿时大小便失禁……

上海超市血案：背后缘由让人揪心

为啥一线城市只有广州取消限购？是因为穷吗

野村：牛市可能重蹈2015年的崩盘

customize 你的 GSEA 图

1引言

2准备基因集

3读取基因

4GSEA 富集分析

5提取数据

6经典图形绘制

curve plot

segment plot

segment and heatmap

gene rank plot

拼图

7绘图拓展

添加基因名

新的 style

添加基因名

添加热图

8结尾

今天的分享就到这里了，敬请期待下一篇！

您可能也对以下帖子感兴趣

一把短刀，怎么就让他连捅18人？！

听纪委朋友说，有的领导干部在被抽掉鞋带和皮带后，一下就崩溃了，甚至个别胆小者顿时大小便失禁……

上海超市血案：背后缘由让人揪心

为啥一线城市只有广州取消限购？是因为穷吗

野村：牛市可能重蹈2015年的崩盘

生成图片，分享到微信朋友圈

customize 你的 GSEA 图

1引言

2准备基因集

3读取基因

4GSEA 富集分析

5提取数据

6经典图形绘制

curve plot

segment plot

segment and heatmap

gene rank plot

拼图

7绘图拓展

添加基因名

新的 style

添加基因名

添加热图

8结尾

今天的分享就到这里了，敬请期待下一篇！

您可能也对以下帖子感兴趣